/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.indexer;
import net.nutch.io.*;
import net.nutch.util.LogFormatter;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.document.Document;
import java.io.*;
import java.util.Vector;
import java.util.logging.Logger;
import java.security.MessageDigest;
/** Deletes duplicate documents in a set of Lucene indexes.
* Duplicates have either the same contents (via MD5 hash) or the same URL.
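*
* <p>A typical invocation (the paths here are only illustrative):
* <pre>
* java net.nutch.indexer.DeleteDuplicates ./segments ./dedup.tmp
* </pre>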
*/
public class DeleteDuplicates {
private static final Logger LOG =
LogFormatter.getLogger("net.nutch.indexer.DeleteDuplicates");
/** The record written for each indexed document; used as the sort key when
* locating duplicates. */
public static class IndexedDoc implements WritableComparable {
private MD5Hash hash = new MD5Hash();
private float score;
private int index; // the segment index
private int doc; // within the index
private int urlLen;
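// Serialized layout (see write()): MD5_LEN bytes of hash, then the score
// (float) at offset MD5_LEN, index at MD5_LEN+4, doc at MD5_LEN+8 and
// urlLen at MD5_LEN+12.  The raw-byte comparators below rely on these
// offsets.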
public void write(DataOutput out) throws IOException {
hash.write(out);
out.writeFloat(score);
out.writeInt(index);
out.writeInt(doc);
out.writeInt(urlLen);
}
public void readFields(DataInput in) throws IOException {
hash.readFields(in);
this.score = in.readFloat();
this.index = in.readInt();
this.doc = in.readInt();
this.urlLen = in.readInt();
}
public int compareTo(Object o) {
// sorting uses the raw-byte comparators below, so this is never called
throw new RuntimeException("this is never used");
}
/** Order equal hashes by decreasing score and increasing urlLen. */
public static class ByHashScore extends WritableComparator {
public ByHashScore() { super(IndexedDoc.class); }
public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2){
int c = compareBytes(b1, s1, MD5Hash.MD5_LEN, b2, s2, MD5Hash.MD5_LEN);
if (c != 0)
return c;
float thisScore = readFloat(b1, s1+MD5Hash.MD5_LEN);
float thatScore = readFloat(b2, s2+MD5Hash.MD5_LEN);
if (thisScore < thatScore)
return 1;
else if (thisScore > thatScore)
return -1;
int thisUrlLen = readInt(b1, s1+MD5Hash.MD5_LEN+12);
int thatUrlLen = readInt(b2, s2+MD5Hash.MD5_LEN+12);
return thisUrlLen - thatUrlLen;
}
}
/** Order equal hashes by decreasing index and document. */
public static class ByHashDoc extends WritableComparator {
public ByHashDoc() { super(IndexedDoc.class); }
public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2){
int c = compareBytes(b1, s1, MD5Hash.MD5_LEN, b2, s2, MD5Hash.MD5_LEN);
if (c != 0)
return c;
int thisIndex = readInt(b1, s1+MD5Hash.MD5_LEN+4);
int thatIndex = readInt(b2, s2+MD5Hash.MD5_LEN+4);
if (thisIndex != thatIndex)
return thatIndex - thisIndex;
int thisDoc = readInt(b1, s1+MD5Hash.MD5_LEN+8);
int thatDoc = readInt(b2, s2+MD5Hash.MD5_LEN+8);
return thatDoc - thisDoc;
}
}
}
/** Strategy for filling in the MD5 hash that identifies a document. */
private interface Hasher {
void updateHash(MD5Hash hash, Document doc);
}
private IndexReader[] readers;
private String tempFile;
/** Constructs a duplicate detector for the provided indexes. */
public DeleteDuplicates(IndexReader[] readers, String tempFile) {
this.readers = readers;
this.tempFile = tempFile;
}
/** Closes the indexes, saving changes. */
public void close() throws IOException {
for (int i = 0; i < readers.length; i++)
readers[i].close();
}
/** Delete pages with duplicate content hashes. Of those with the same
* content hash, keep the page with the highest score. */
public void deleteContentDuplicates() throws IOException {
LOG.info("Reading content hashes...");
computeHashes(new Hasher() {
public void updateHash(MD5Hash hash, Document doc) {
hash.setDigest(doc.get("digest"));
}
});
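// Sorting with ByHashScore groups equal content hashes together and puts
// the highest-scoring page (on ties, the one with the shortest URL) first,
// so deleteDuplicates() keeps that page and removes the rest of each group.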
LOG.info("Sorting content hashes...");
SequenceFile.Sorter byHashScoreSorter =
new SequenceFile.Sorter(new IndexedDoc.ByHashScore(), NullWritable.class);
byHashScoreSorter.sort(tempFile, tempFile + ".sorted");
LOG.info("Deleting content duplicates...");
int duplicateCount = deleteDuplicates();
LOG.info("Deleted " + duplicateCount + " content duplicates.");
}
/** Delete pages with duplicate URLs. Of those with the same
* URL, keep the most recently fetched page. */
public void deleteUrlDuplicates() throws IOException {
final MessageDigest digest;
try {
digest = MessageDigest.getInstance("MD5");
} catch (Exception e) {
throw new RuntimeException(e.toString());
}
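// A single MessageDigest instance is reused for all documents; digest()
// resets it after each call, so one URL's bytes never leak into the next
// hash.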
LOG.info("Reading url hashes...");
computeHashes(new Hasher() {
public void updateHash(MD5Hash hash, Document doc) {
try {
digest.update(UTF8.getBytes(doc.get("url")));
digest.digest(hash.getDigest(), 0, MD5Hash.MD5_LEN);
} catch (Exception e) {
throw new RuntimeException(e.toString());
}
}
});
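// Sorting with ByHashDoc groups equal URL hashes together and puts the
// record with the highest index and document number first; assuming later
// segments hold the more recent fetches, deleteDuplicates() then keeps the
// newest copy of each URL.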
LOG.info("Sorting url hashes...");
SequenceFile.Sorter byHashDocSorter =
new SequenceFile.Sorter(new IndexedDoc.ByHashDoc(), NullWritable.class);
byHashDocSorter.sort(tempFile, tempFile + ".sorted");
LOG.info("Deleting url duplicates...");
int duplicateCount = deleteDuplicates();
LOG.info("Deleted " + duplicateCount + " url duplicates.");
}
/** Writes an IndexedDoc record for every undeleted document in every index,
* using the given Hasher to compute the hash field. */
private void computeHashes(Hasher hasher) throws IOException {
IndexedDoc indexedDoc = new IndexedDoc();
SequenceFile.Writer writer =
new SequenceFile.Writer(tempFile, IndexedDoc.class, NullWritable.class);
try {
for (int index = 0; index < readers.length; index++) {
IndexReader reader = readers[index];
int readerMax = reader.maxDoc();
indexedDoc.index = index;
for (int doc = 0; doc < readerMax; doc++) {
if (!reader.isDeleted(doc)) {
Document document = reader.document(doc);
hasher.updateHash(indexedDoc.hash, document);
indexedDoc.score = Float.parseFloat(document.get("boost"));
indexedDoc.doc = doc;
indexedDoc.urlLen = document.get("url").length();
writer.append(indexedDoc, NullWritable.get());
}
}
}
} finally {
writer.close();
}
}
/** Scans the sorted records and deletes every document whose hash equals
* that of the preceding record, so only the first document in each group of
* duplicates survives. */
private int deleteDuplicates() throws IOException {
if (new File(tempFile).exists())
new File(tempFile).delete();
if (!new File(tempFile + ".sorted").renameTo(new File(tempFile)))
throw new IOException("Couldn't rename!");
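// The sorted file has now replaced the unsorted temp file, so the reader
// below sees the records grouped by hash; only the first record of each
// group is kept.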
IndexedDoc indexedDoc = new IndexedDoc();
SequenceFile.Reader reader = new SequenceFile.Reader(tempFile);
try {
int duplicateCount = 0;
MD5Hash prevHash = null; // previous hash
while (reader.next(indexedDoc, NullWritable.get())) {
if (prevHash == null) { // initialize prevHash
prevHash = new MD5Hash();
prevHash.set(indexedDoc.hash);
continue;
}
if (indexedDoc.hash.equals(prevHash)) { // found a duplicate
readers[indexedDoc.index].delete(indexedDoc.doc); // delete it
duplicateCount++;
} else {
prevHash.set(indexedDoc.hash); // reset prevHash
}
}
return duplicateCount;
} finally {
reader.close();
new File(tempFile).delete();
}
}
/** Deletes duplicates in the indexes under the named segments directory. */
public static void main(String[] args) throws Exception {
String usage = "DeleteDuplicates <segmentsDir> <tempFile>";
if (args.length != 2) {
System.err.println("Usage: " + usage);
return;
}
String segmentsDir = args[0];
String tempFile = args[1];
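// Each subdirectory of segmentsDir that contains the IndexSegment.DONE_NAME
// marker file contributes one index; clearing any leftover deletions first
// lets repeated runs start from a clean slate.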
File[] directories = new File(segmentsDir).listFiles();
Vector vReaders = new Vector();
for (int i = 0; i < directories.length; i++) {
File indexDone = new File(directories[i], IndexSegment.DONE_NAME);
if (indexDone.exists() && indexDone.isFile()) {
File indexDir = new File(directories[i], "index");
IndexReader reader = IndexReader.open(indexDir);
if (reader.hasDeletions()) {
LOG.info("Clearing old deletions in " + indexDir);
reader.undeleteAll();
}
vReaders.add(reader);
}
}
IndexReader[] readers =
(IndexReader[])vReaders.toArray(new IndexReader[vReaders.size()]);
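// URL duplicates are removed first, then content duplicates; close()
// commits the deletions to the indexes.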
DeleteDuplicates dd = new DeleteDuplicates(readers, tempFile);
dd.deleteUrlDuplicates();
dd.deleteContentDuplicates();
dd.close();
}
}